library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0 v readr 1.3.1
## v tibble 1.4.2 v purrr 0.2.5
## v tidyr 0.8.2 v stringr 1.3.1
## v ggplot2 3.1.0 v forcats 0.4.0
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(cluster)
library(factoextra)
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
# Read the domestic T20 career batting ratings from the local CSV export.
# NOTE(review): the path is absolute and machine-specific, so the script only
# runs where this exact file exists.
domestict20batting <- read.csv(
  file = "D:\\Vishal\\III year\\Data Analytics\\Assignment II\\Player Ratings\\domestict20careerbattingrating_mod.csv"
)
# Show the first six rows as a quick sanity check of the import.
head(x = domestict20batting, n = 6L)
## Name Matches Innings Not_Outs Runs High_Score Average No_Of_100
## 1 C H Gayle 256 251 34 9120 175 42.02 17
## 2 V Kohli 204 192 38 6446 113 41.85 4
## 3 B J Hodge 256 242 55 6997 106 37.41 2
## 4 D A Warner 220 219 22 6868 135 34.86 5
## 5 K A Pollard 312 279 84 6095 89 31.25 0
## 6 S K Rai0 242 228 37 6326 109 33.12 3
## No_Of_50 Strike_Rate Catches_Taken Runs.Innings Rating ScaledRating
## 1 57 150.34 64 36.33466 188036.28 20.82
## 2 46 133.01 89 33.57292 121996.95 18.69
## 3 46 131.72 96 28.91322 116714.79 18.48
## 4 54 143.68 96 31.36073 110581.51 18.24
## 5 30 152.71 177 21.84588 104085.82 17.96
## 6 36 139.00 124 27.74561 98915.61 17.73
## LogRating Stumpings
## 1 5.274242 NA
## 2 5.086349 NA
## 3 5.067126 NA
## 4 5.043683 NA
## 5 5.017392 NA
## 6 4.995265 NA
# Per-column descriptive summary of the raw table; it also reports how many
# values are missing in each column.
summary(object = domestict20batting)
## Name Matches Innings Not_Outs
## A Singh : 2 Min. : 1.00 Min. : 0.00 Min. : 0.00
## S Sharma : 2 1st Qu.: 30.00 1st Qu.: 17.00 1st Qu.: 4.00
## Yuvraj Singh : 2 Median : 60.00 Median : 38.00 Median : 9.00
## A A Bilakhia : 1 Mean : 80.75 Mean : 59.58 Mean :13.28
## A A Chavan : 1 3rd Qu.:119.00 3rd Qu.: 80.00 3rd Qu.:18.00
## A A Jhunjhunwala: 1 Max. :312.00 Max. :279.00 Max. :86.00
## (Other) :477
## Runs High_Score Average No_Of_100
## Min. : 0.0 Min. : 0.00 Min. : 0.00 Min. : 0.0000
## 1st Qu.: 102.2 1st Qu.: 22.00 1st Qu.:10.57 1st Qu.: 0.0000
## Median : 476.0 Median : 56.50 Median :19.52 Median : 0.0000
## Mean :1156.4 Mean : 57.17 Mean :18.52 Mean : 0.3025
## 3rd Qu.:1394.5 3rd Qu.: 86.00 3rd Qu.:25.29 3rd Qu.: 0.0000
## Max. :9120.0 Max. :175.00 Max. :56.00 Max. :17.0000
##
## No_Of_50 Strike_Rate Catches_Taken Runs.Innings
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.:102.3 1st Qu.: 8.00 1st Qu.: 5.609
## Median : 1.000 Median :119.0 Median : 20.00 Median :14.171
## Mean : 5.527 Mean :113.1 Mean : 27.87 Mean :14.049
## 3rd Qu.: 6.000 3rd Qu.:129.1 3rd Qu.: 38.00 3rd Qu.:21.298
## Max. :57.000 Max. :190.9 Max. :177.00 Max. :36.335
##
## Rating ScaledRating LogRating Stumpings
## Min. : 0.0 Min. : 0.000 Min. :-1.148 Min. : 1.00
## 1st Qu.: 863.3 1st Qu.: 5.420 1st Qu.: 2.936 1st Qu.: 4.00
## Median : 4908.2 Median : 8.370 Median : 3.691 Median : 9.50
## Mean : 14462.3 Mean : 8.559 Mean : 3.461 Mean :13.65
## 3rd Qu.: 15490.2 3rd Qu.:11.158 3rd Qu.: 4.190 3rd Qu.:16.00
## Max. :188036.3 Max. :20.820 Max. : 5.274 Max. :77.00
## NA's :432
# Partition the players into three groups with k-means on columns 2:15, i.e.
# every column except Name (1) and Stumpings (16).
# The seed fixes the random initial centres so the result is repeatable.
set.seed(20)
domesticBatCluster <- kmeans(domestict20batting[, 2:15], 3)

# k-means numbers its clusters arbitrarily, so cluster 1 is not necessarily the
# strongest group. Rank the clusters by the Rating of their centre (highest
# first) and give each cluster id the tier name that matches its rank, rather
# than assuming ids 1, 2, 3 already mean best, good, bad.
# NOTE(review): assumes a larger Rating means a better player - confirm.
domesticBatTierLabels <- c("Best players", "Good players", "Bad Players")[
  rank(-domesticBatCluster$centers[, "Rating"], ties.method = "first")
]

# Store the assignments as a factor so the plots treat them as discrete groups.
domesticBatCluster$cluster <- as.factor(domesticBatCluster$cluster)
# domesticBatCluster$cluster  # uncomment to inspect the raw assignments

# Innings against runs per innings, coloured by cluster. Rows where
# Runs/Innings is not a number (presumably players with zero innings) are
# dropped by geom_point with a warning.
ggplot(domestict20batting, aes(Innings, Runs/Innings, color = domesticBatCluster$cluster)) +
  geom_point(size = 2) +
  # Labels are listed in factor-level order (cluster 1, 2, 3).
  scale_color_hue(labels = domesticBatTierLabels) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ggtitle("Domestic T20 Batting Average")
## Warning: Removed 3 rows containing missing values (geom_point).

library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Interactive scatter of innings against runs per innings, coloured by the
# k-means cluster, with each player's name attached as the marker text.
p <- plot_ly(
  data = domestict20batting,
  x = ~Innings,
  y = ~Runs/Innings,
  type = "scatter",
  mode = "markers",
  color = domesticBatCluster$cluster,
  text = ~paste("Name: ", Name)
)
# Add the chart title in a separate step, then display the widget.
p <- layout(p, title = "Cluster of Averages (batsmen)")
p
## Warning: Ignoring 3 observations
# Look at the first six rows of the table again.
head(x = domestict20batting, n = 6L)
## Name Matches Innings Not_Outs Runs High_Score Average No_Of_100
## 1 C H Gayle 256 251 34 9120 175 42.02 17
## 2 V Kohli 204 192 38 6446 113 41.85 4
## 3 B J Hodge 256 242 55 6997 106 37.41 2
## 4 D A Warner 220 219 22 6868 135 34.86 5
## 5 K A Pollard 312 279 84 6095 89 31.25 0
## 6 S K Rai0 242 228 37 6326 109 33.12 3
## No_Of_50 Strike_Rate Catches_Taken Runs.Innings Rating ScaledRating
## 1 57 150.34 64 36.33466 188036.28 20.82
## 2 46 133.01 89 33.57292 121996.95 18.69
## 3 46 131.72 96 28.91322 116714.79 18.48
## 4 54 143.68 96 31.36073 110581.51 18.24
## 5 30 152.71 177 21.84588 104085.82 17.96
## 6 36 139.00 124 27.74561 98915.61 17.73
## LogRating Stumpings
## 1 5.274242 NA
## 2 5.086349 NA
## 3 5.067126 NA
## 4 5.043683 NA
## 5 5.017392 NA
## 6 4.995265 NA
# Second clustering, restricted to three per-innings style measures.
df <- domestict20batting %>%
  select(Average, Strike_Rate, Runs.Innings)
# NOTE: no seed is set here, so the outcome follows from the RNG state left by
# the earlier set.seed(20) call and any random draws made since.
domesticBatCluster1 <- kmeans(x = df, centers = 3)
# Store the assignments as a factor so plotly colours them as discrete groups.
domesticBatCluster1$cluster <- as.factor(domesticBatCluster1$cluster)
# Interactive scatter of matches against strike rate, coloured by this second
# clustering, with each player's name attached as the marker text.
q <- plot_ly(
  data = domestict20batting,
  x = ~Matches,
  y = ~Strike_Rate,
  type = "scatter",
  mode = "markers",
  color = domesticBatCluster1$cluster,
  text = ~paste("Name: ", Name)
)
q <- layout(q, title = "Cluster of matches vs Strike Rate")
q
# Inspect the column types before preparing the table for distance-based work.
str(object = domestict20batting)
## 'data.frame': 486 obs. of 16 variables:
## $ Name : Factor w/ 483 levels "A A Bilakhia",..: 84 458 70 100 193 388 433 62 333 271 ...
## $ Matches : int 256 204 256 220 312 242 233 236 234 240 ...
## $ Innings : int 251 192 242 219 279 228 221 233 223 216 ...
## $ Not_Outs : int 34 38 55 22 84 37 63 22 38 86 ...
## $ Runs : int 9120 6446 6997 6868 6095 6326 5995 6671 6137 4884 ...
## $ High_Score : int 175 113 106 135 89 109 95 158 109 73 ...
## $ Average : num 42 41.9 37.4 34.9 31.2 ...
## $ No_Of_100 : int 17 4 2 5 0 3 0 7 3 0 ...
## $ No_Of_50 : int 57 46 46 54 30 36 37 32 42 18 ...
## $ Strike_Rate : num 150 133 132 144 153 ...
## $ Catches_Taken: int 64 89 96 96 177 124 92 96 91 122 ...
## $ Runs.Innings : num 36.3 33.6 28.9 31.4 21.8 ...
## $ Rating : num 188036 121997 116715 110582 104086 ...
## $ ScaledRating : num 20.8 18.7 18.5 18.2 18 ...
## $ LogRating : num 5.27 5.09 5.07 5.04 5.02 ...
## $ Stumpings : int NA NA NA NA NA NA NA 14 NA 59 ...
# Check whether the table contains any missing values at all.
any(is.na(domestict20batting))
## [1] TRUE
# Set the Stumpings column aside and drop it from the working table (the
# summary of the raw data reports NA's only for this column).
# It is kept under its own name: it used to be stored in
# domestict20batting_label, which is reassigned to the player names further
# down, so the saved stumpings were silently overwritten.
domestict20batting_stumpings <- domestict20batting$Stumpings
domestict20batting$Stumpings <- NULL
# Confirm the column is gone.
str(domestict20batting)
## 'data.frame': 486 obs. of 15 variables:
## $ Name : Factor w/ 483 levels "A A Bilakhia",..: 84 458 70 100 193 388 433 62 333 271 ...
## $ Matches : int 256 204 256 220 312 242 233 236 234 240 ...
## $ Innings : int 251 192 242 219 279 228 221 233 223 216 ...
## $ Not_Outs : int 34 38 55 22 84 37 63 22 38 86 ...
## $ Runs : int 9120 6446 6997 6868 6095 6326 5995 6671 6137 4884 ...
## $ High_Score : int 175 113 106 135 89 109 95 158 109 73 ...
## $ Average : num 42 41.9 37.4 34.9 31.2 ...
## $ No_Of_100 : int 17 4 2 5 0 3 0 7 3 0 ...
## $ No_Of_50 : int 57 46 46 54 30 36 37 32 42 18 ...
## $ Strike_Rate : num 150 133 132 144 153 ...
## $ Catches_Taken: int 64 89 96 96 177 124 92 96 91 122 ...
## $ Runs.Innings : num 36.3 33.6 28.9 31.4 21.8 ...
## $ Rating : num 188036 121997 116715 110582 104086 ...
## $ ScaledRating : num 20.8 18.7 18.5 18.2 18 ...
## $ LogRating : num 5.27 5.09 5.07 5.04 5.02 ...
# Keep the player names in a separate vector, then remove the column from the
# working table so that it can be passed to scale() further down.
domestict20batting_label <- domestict20batting[["Name"]]
domestict20batting[["Name"]] <- NULL
# Confirm which columns remain.
str(object = domestict20batting)
## 'data.frame': 486 obs. of 14 variables:
## $ Matches : int 256 204 256 220 312 242 233 236 234 240 ...
## $ Innings : int 251 192 242 219 279 228 221 233 223 216 ...
## $ Not_Outs : int 34 38 55 22 84 37 63 22 38 86 ...
## $ Runs : int 9120 6446 6997 6868 6095 6326 5995 6671 6137 4884 ...
## $ High_Score : int 175 113 106 135 89 109 95 158 109 73 ...
## $ Average : num 42 41.9 37.4 34.9 31.2 ...
## $ No_Of_100 : int 17 4 2 5 0 3 0 7 3 0 ...
## $ No_Of_50 : int 57 46 46 54 30 36 37 32 42 18 ...
## $ Strike_Rate : num 150 133 132 144 153 ...
## $ Catches_Taken: int 64 89 96 96 177 124 92 96 91 122 ...
## $ Runs.Innings : num 36.3 33.6 28.9 31.4 21.8 ...
## $ Rating : num 188036 121997 116715 110582 104086 ...
## $ ScaledRating : num 20.8 18.7 18.5 18.2 18 ...
## $ LogRating : num 5.27 5.09 5.07 5.04 5.02 ...
# Standardise every column (centre on the column mean, divide by the column
# standard deviation) and turn the resulting matrix back into a data frame.
domestict20batting_sc <- domestict20batting %>%
  scale(center = TRUE, scale = TRUE) %>%
  as.data.frame()
# Check the rescaled columns.
summary(object = domestict20batting_sc)
## Matches Innings Not_Outs Runs
## Min. :-1.2271 Min. :-0.9941 Min. :-0.9950 Min. :-0.7375
## 1st Qu.:-0.7809 1st Qu.:-0.7104 1st Qu.:-0.6952 1st Qu.:-0.6723
## Median :-0.3193 Median :-0.3600 Median :-0.3206 Median :-0.4339
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.5886 3rd Qu.: 0.3408 3rd Qu.: 0.3539 3rd Qu.: 0.1519
## Max. : 3.5582 Max. : 3.6614 Max. : 5.4494 Max. : 5.0790
## High_Score Average No_Of_100 No_Of_50
## Min. :-1.53913 Min. :-1.9488 Min. :-0.2746 Min. :-0.57008
## 1st Qu.:-0.94687 1st Qu.:-0.8360 1st Qu.:-0.2746 1st Qu.:-0.57008
## Median :-0.01811 Median : 0.1052 Median :-0.2746 Median :-0.46693
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.77604 3rd Qu.: 0.7128 3rd Qu.:-0.2746 3rd Qu.: 0.04882
## Max. : 3.17198 Max. : 3.9453 Max. :15.1578 Max. : 5.30946
## Strike_Rate Catches_Taken Runs.Innings Rating
## Min. :-4.0852 Min. :-0.9939 Min. :-1.61390 Min. :-0.6247
## 1st Qu.:-0.3883 1st Qu.:-0.7087 1st Qu.:-0.96955 1st Qu.:-0.5874
## Median : 0.2132 Median :-0.2808 Median : 0.01399 Median :-0.4127
## Mean : 0.0000 Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.5799 3rd Qu.: 0.3610 3rd Qu.: 0.83268 3rd Qu.: 0.0444
## Max. : 2.8117 Max. : 5.3174 Max. : 2.56007 Max. : 7.4973
## ScaledRating LogRating
## Min. :-2.03858 Min. :-4.1949
## 1st Qu.:-0.74769 1st Qu.:-0.4778
## Median :-0.04508 Median : 0.2091
## Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.61882 3rd Qu.: 0.6634
## Max. : 2.92016 Max. : 1.6501
# Agglomerative hierarchical clustering of the standardised table:
# pairwise Euclidean distances, merged with average linkage.
dist_mat <- dist(domestict20batting_sc, method = "euclidean")
hclust_avg <- hclust(dist_mat, method = "average")
# Plain dendrogram first.
plot(hclust_avg)
# Cut the tree into six clusters; cut_avg holds one cluster id per row.
cut_avg <- cutree(hclust_avg, k = 6)
# Redraw the dendrogram and outline the six clusters on it.
plot(hclust_avg)
# One border colour per cluster: 2:6 supplied only five colours for six
# rectangles, so the vector was recycled and two clusters shared a colour.
rect.hclust(hclust_avg, k = 6, border = 2:7)
# Reference line at height 3.
# NOTE(review): this height is chosen independently of k = 6; whether cutting
# at 3 also yields six clusters is not checked here.
abline(h = 3, col = "red")

library(dendextend)
##
## ---------------------
## Welcome to dendextend version 1.9.0
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
# Convert the hclust result to a dendrogram object so dendextend can style it.
avg_dend_obj <- as.dendrogram(object = hclust_avg)
# Colour the branches by the groups obtained from cutting the tree at height 3.
avg_col_dend <- avg_dend_obj %>%
  color_branches(h = 3)
plot(x = avg_col_dend)

# Attach the six-cluster assignment from the tree cut to the unscaled table.
domestict20batting_cl <- domestict20batting %>%
  mutate(cluster = cut_avg)
# Number of players in each cluster.
domestict20batting_cl %>%
  count(cluster)
## # A tibble: 6 x 2
## cluster n
## <int> <int>
## 1 1 1
## 2 2 9
## 3 3 10
## 4 4 89
## 5 5 376
## 6 6 1
# Matches against runs, coloured by hierarchical cluster. The cluster id is
# wrapped in factor() so it is drawn as discrete colours, not a gradient.
ggplot(
  data = domestict20batting_cl,
  mapping = aes(x = Matches, y = Runs, color = factor(cluster))
) +
  geom_point()
